{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# [ Chapter 7 - Interpreting Query Intent through Semantic Search ]\n",
    "# Setting up the Reviews Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "\n",
    "import aips.data_loaders.cities as cities\n",
    "import aips.data_loaders.reviews as reviews\n",
    "from aips import get_engine\n",
    "from aips.spark.dataframe import from_csv\n",
    "\n",
    "spark = SparkSession.builder.appName(\"AIPS\").getOrCreate()\n",
    "\n",
    "engine = get_engine()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Download the Datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Already up to date.\n",
      "._reviews.csv\n",
      "reviews.csv\n",
      "entities.csv\n",
      "._cities.csv\n",
      "cities.csv\n"
     ]
    }
   ],
   "source": [
    "#Get datasets\n",
    "![ ! -d 'reviews' ] && git clone --depth 1 https://github.com/ai-powered-search/reviews.git\n",
    "! cd reviews && git pull\n",
    "! cd reviews && mkdir -p '../data/reviews/' && tar -xvf reviews.tgz -C '../data/reviews/' && tar -xvf entities.tgz -C '../data/reviews/' && tar -xvf cities.tgz -C '../data/reviews/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wiping \"reviews\" collection\n",
      "Creating \"reviews\" collection\n",
      "Status: Success\n",
      "\n",
      "Loading Reviews...\n",
      "root\n",
      " |-- id: string (nullable = true)\n",
      " |-- business_name: string (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- city: string (nullable = true)\n",
      " |-- state: string (nullable = true)\n",
      " |-- content: string (nullable = true)\n",
      " |-- categories: string (nullable = true)\n",
      " |-- stars_rating: integer (nullable = true)\n",
      " |-- location_coordinates: string (nullable = true)\n",
      "\n",
      "Successfully written 192138 documents\n"
     ]
    }
   ],
   "source": [
    "reviews_collection = get_engine(\"solr\").create_collection(\"reviews\")\n",
    "reviews_data = reviews.load_dataframe(\"data/reviews/reviews.csv\")\n",
    "reviews_collection.write(reviews_data)\n",
    "\n",
    "entities_collection =  get_engine(\"solr\").create_collection(\"entities\")\n",
    "entities_dataframe = from_csv(\"data/reviews/entities.csv\")\n",
    "cities_dataframe = cities.load_dataframe(\"data/reviews/cities.csv\")\n",
    "entities_collection.write(entities_dataframe)\n",
    "entities_collection.write(cities_dataframe)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reviews Dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Listing 7.1\n",
    "### Loading and indexing the reviews dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wiping \"reviews\" collection\n",
      "Creating \"reviews\" collection\n",
      "Status: Success\n",
      "\n",
      "Loading Reviews...\n",
      "root\n",
      " |-- id: string (nullable = true)\n",
      " |-- business_name: string (nullable = true)\n",
      " |-- name: string (nullable = true)\n",
      " |-- city: string (nullable = true)\n",
      " |-- state: string (nullable = true)\n",
      " |-- content: string (nullable = true)\n",
      " |-- categories: string (nullable = true)\n",
      " |-- stars_rating: integer (nullable = true)\n",
      " |-- location_coordinates: string (nullable = true)\n",
      "\n",
      "Successfully written 192138 documents\n"
     ]
    }
   ],
   "source": [
    "reviews_collection = engine.create_collection(\"reviews\")\n",
    "reviews_data = reviews.load_dataframe(\"data/reviews/reviews.csv\")\n",
    "reviews_collection.write(reviews_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Listing 7.2 and Figure 7.2 - 7.7\n",
    "\n",
    "#### Located in the [Semantic Search Application](2.semantic-search.ipynb#listing-7.2) notebook"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Listing 7.3\n",
    "<a id='listing-7.3'></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def display_entities(dataframe, limit=10):\n",
    "    print(\"Entities\")\n",
    "    dataframe.drop(\"semantic_function\").show(limit, truncate=20)\n",
    "    print(\"... Entities continued\")\n",
    "    dataframe.filter(dataframe.type == \"semantic_function\") \\\n",
    "        .select(\"id\", \"semantic_function\").show(truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Entities\n",
      "+---+--------------------+--------------------+-----------------+----------+\n",
      "| id|        surface_form|      canonical_form|             type|popularity|\n",
      "+---+--------------------+--------------------+-----------------+----------+\n",
      "|  1|                near| {location_distance}|semantic_function|        90|\n",
      "|  2|                  in| {location_distance}|semantic_function|       100|\n",
      "|  3|                  by| {location_distance}|semantic_function|        90|\n",
      "|  4|                  by|{text_within_one_...|semantic_function|        10|\n",
      "|  5|                near|     {text_distance}|semantic_function|        10|\n",
      "|  6|             popular|           {popular}|semantic_function|       100|\n",
      "|  7|                 top|           {popular}|semantic_function|       100|\n",
      "|  8|                best|           {popular}|semantic_function|       100|\n",
      "|  9|                good|           {popular}|semantic_function|       100|\n",
      "| 10|              violet|              violet|            color|       100|\n",
      "| 11|       violet crowne|       violet crowne|            brand|       100|\n",
      "| 12|violet crowne cha...|violet crowne cha...|    movie_theater|       100|\n",
      "| 13|        violet crown|       violet crowne|            brand|       100|\n",
      "| 14|violet crown char...|violet crowne cha...|    movie_theater|       100|\n",
      "| 15|            haystack| haystack conference|            event|       100|\n",
      "| 16|       haystack conf| haystack conference|            event|       100|\n",
      "| 17| haystack conference| haystack conference|            event|       100|\n",
      "| 18|            heystack| haystack conference|            event|       100|\n",
      "| 19|       heystack conf| haystack conference|            event|       100|\n",
      "| 20| heystack conference| haystack conference|            event|       100|\n",
      "+---+--------------------+--------------------+-----------------+----------+\n",
      "only showing top 20 rows\n",
      "\n",
      "... Entities continued\n",
      "+---+----------------------------------------------+\n",
      "|id |semantic_function                             |\n",
      "+---+----------------------------------------------+\n",
      "|1  |location_distance(query, position)            |\n",
      "|2  |location_distance(query, position)            |\n",
      "|3  |location_distance(query, position)            |\n",
      "|4  |text_within_one_edit_distance(query, position)|\n",
      "|5  |text_distance(query, position)                |\n",
      "|6  |popularity(query, position)                   |\n",
      "|7  |popularity(query, position)                   |\n",
      "|8  |popularity(query, position)                   |\n",
      "|9  |popularity(query, position)                   |\n",
      "+---+----------------------------------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "entities_dataframe = from_csv(\"data/reviews/entities.csv\", log=False)\n",
    "display_entities(entities_dataframe, limit=20)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Cities Dataset (Geonames)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Listing 7.4\n",
    "<a id='listing-7.4'></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wiping \"entities\" collection\n",
      "Creating \"entities\" collection\n",
      "Status: Success\n",
      "Loading data/reviews/entities.csv\n",
      "Schema: \n",
      "root\n",
      " |-- id: integer (nullable = true)\n",
      " |-- surface_form: string (nullable = true)\n",
      " |-- canonical_form: string (nullable = true)\n",
      " |-- type: string (nullable = true)\n",
      " |-- popularity: integer (nullable = true)\n",
      " |-- semantic_function: string (nullable = true)\n",
      "\n",
      "Loading Geonames...\n",
      "Successfully written 21 documents\n",
      "Successfully written 137581 documents\n"
     ]
    }
   ],
   "source": [
    "entities_collection = engine.create_collection(\"entities\")\n",
    "entities_dataframe = from_csv(\"data/reviews/entities.csv\")\n",
    "cities_dataframe = cities.load_dataframe(\"data/reviews/cities.csv\")\n",
    "entities_collection.write(entities_dataframe)\n",
    "entities_collection.write(cities_dataframe)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Enities Dataset (Manually-specified Knowledge Graph)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Listing 7.5\n",
    "<a id='listing-7.5'></a>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %load -s add_tag_type_commands,add_rag_request_handler_config engines/solr/SolrEngine.py\n",
    "add_tag_type_commands = [{\n",
    "    \"add-field-type\": {\n",
    "        \"name\": \"tag\",\n",
    "        \"class\": \"solr.TextField\",\n",
    "        \"postingsFormat\": \"FST50\",\n",
    "        \"omitNorms\": \"true\",\n",
    "        \"omitTermFreqAndPositions\": \"true\",\n",
    "        \"indexAnalyzer\": {\n",
    "            \"tokenizer\": {\"class\": \"solr.StandardTokenizerFactory\"},\n",
    "            \"filters\": [\n",
    "                {\"class\": \"solr.EnglishPossessiveFilterFactory\"},\n",
    "                {\"class\": \"solr.ASCIIFoldingFilterFactory\"},\n",
    "                {\"class\": \"solr.LowerCaseFilterFactory\"},\n",
    "                {\"class\": \"solr.ConcatenateGraphFilterFactory\",\n",
    "                 \"preservePositionIncrements\": \"false\"}]},\n",
    "        \"queryAnalyzer\": {\n",
    "            \"tokenizer\": {\"class\": \"solr.StandardTokenizerFactory\"},\n",
    "            \"filters\": [{\"class\": \"solr.EnglishPossessiveFilterFactory\"},\n",
    "                        {\"class\": \"solr.ASCIIFoldingFilterFactory\"},\n",
    "                        {\"class\": \"solr.LowerCaseFilterFactory\"}]}}\n",
    "    },\n",
    "    {\"add-field\": {\"name\": \"name_tag\", \"type\": \"tag\",\n",
    "                   \"stored\": \"false\"}},\n",
    "    {\"add-copy-field\": {\"source\": \"surface_form\",\n",
    "                        \"dest\": [\"name_tag\"]}}]\n",
    "\n",
    "add_tag_request_handler_config = {\n",
    "        \"add-requesthandler\" : {\n",
    "            \"name\": \"/tag\",\n",
    "            \"class\": \"solr.TaggerRequestHandler\",\n",
    "            \"defaults\": {\n",
    "                \"field\": \"name_tag\",\n",
    "                \"json.nl\": \"map\",\n",
    "                \"sort\": \"popularity desc\",\n",
    "                \"matchText\": \"true\",\n",
    "                \"fl\": \"id,canonical_form,surface_form,type,semantic_function,popularity,country,admin_area,location_coordinates\"\n",
    "            }\n",
    "        }\n",
    "    }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Success!\n",
    "\n",
    "Now that you've indexed the Reviews Dataset and semantic data, it's time to test our end to end semantic search example!\n",
    "\n",
    "Up next: [Semantic search](2.semantic-search.ipynb)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
